package com.xsdlr.processor;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * WebMagic {@link PageProcessor} that extracts a college's id, name and image
 * address from a crawled page.
 *
 * <p>The id and name are read from the {@code id} and {@code wd} parameters of
 * the page URL; the image address is the first result of an XPath query for
 * {@code img/@src} under a {@code ul} whose class contains
 * {@code uzy-college-list}. Pages for which no image address is found are
 * flagged with {@link Page#setSkip(boolean)}.
 *
 * <p>Created by xsdlr on 2017/4/5.
 */
@Component
public class YouzyPageProcessor implements PageProcessor {
    private static final Logger logger = LoggerFactory.getLogger(YouzyPageProcessor.class);

    /** Captures the numeric value following {@code id=} in the page URL. */
    private static final String ID_REGEX = "id=(\\d+)";

    /**
     * Captures the value following {@code wd=} in the page URL, stopping at the
     * next {@code &} or at the end of the URL. A negated character class is used
     * instead of a greedy {@code .*} so that trailing parameters are never
     * swallowed into the name and a {@code wd} in last position still matches.
     */
    private static final String NAME_REGEX = "wd=([^&]*)";

    /** Selects the {@code src} attribute of images inside the college list. */
    private static final String IMAGE_XPATH =
            "//ul[contains(@class, 'uzy-college-list')]//img[1]/@src";

    /** Crawl settings: 10 retries, 10 cycle retries, sleep time of 100 between pages. */
    private final Site site = Site.me()
            .setRetryTimes(10)
            .setCycleRetryTimes(10)
            .setSleepTime(100);

    /**
     * Extracts {@code id}, {@code name} and {@code img} from the page and stores
     * them as result fields. When the image address is missing or blank the page
     * is marked as skipped and a warning is logged; otherwise the extracted
     * values are logged at info level.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {
        String id = page.getUrl().regex(ID_REGEX).toString();
        // NOTE(review): the name is taken verbatim from the URL; if the URL is
        // percent-encoded the name will be too - confirm whether decoding is needed.
        String name = page.getUrl().regex(NAME_REGEX).toString();
        String imagePath = page.getHtml().xpath(IMAGE_XPATH).toString();

        // Fields are stored even for skipped pages, matching the previous behavior.
        page.putField("id", id);
        page.putField("name", name);
        page.putField("img", imagePath);

        if (imagePath == null || imagePath.trim().isEmpty()) {
            logger.warn("学校图片信息不全,id:{},学校名称:{}", id, name);
            page.setSkip(true);
            return;
        }
        logger.info("id:{},学校名称:{},图片地址:{}", id, name, imagePath);
    }

    /**
     * Returns the crawl settings used for this processor.
     *
     * @return the {@link Site} configured for this processor
     */
    @Override
    public Site getSite() {
        return this.site;
    }
}
